{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_process import tokenizer_process, build_vocab, sentence_to_onehot, cal_idf, sentence_to_tfidf, sentence_to_index\n",
    "from data_process import pos_process, build_vocab_pos, sentence_to_onehot_pos, cal_idf_pos, sentence_to_tfidf_pos, sentence_to_index_pos\n",
    "from data_process import morphs_process, build_vocab_morphs, sentence_to_onehot_morphs, cal_idf_morphs, sentence_to_tfidf_morphs, sentence_to_index_morphs\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data processing with a simple tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = ['나는 생각한다 고로 나는 존재한다.',\n",
    "        '모든 국가는 그에 걸맞는 국가를 가진다.',\n",
    "        '이것 또한 지나가리라',\n",
    "        '죄는 미워하되 사람은 미워하지 마라.',\n",
    "        '일찍 일어나는 새가 벌레를 잡는다']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['나는', '생각한다', '고로', '나는', '존재한다', '.'],\n",
      " ['모든', '국가는', '그에', '걸맞는', '국가를', '가진다', '.'],\n",
      " ['이것', '또한', '지나가리라'],\n",
      " ['죄는', '미워하되', '사람은', '미워하지', '마라', '.'],\n",
      " ['일찍', '일어나는', '새가', '벌레를', '잡는다']]\n"
     ]
    }
   ],
   "source": [
    "pprint(tokenizer_process(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the vocabulary from the simple-tokenizer view of the corpus.\n",
    "# The second return value is discarded; vocab_size is not used later in this notebook.\n",
    "vocab, _, vocab_size = build_vocab(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['<PAD>', '<UNK>', '.', '나는', '생각한다', '고로', '존재한다', '모든', '국가는', '그에', '걸맞는', '국가를', '가진다', '이것', '또한', '지나가리라', '죄는', '미워하되', '사람은', '미워하지', '마라', '일찍', '일어나는', '새가', '벌레를', '잡는다'])\n"
     ]
    }
   ],
   "source": [
    "pprint(vocab.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0],\n",
      "       [0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0],\n",
      "       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0],\n",
      "       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,\n",
      "        0, 0, 0, 0],\n",
      "       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
      "        1, 1, 1, 1]])\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_onehot(data, vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Result of cal_idf for the simple-tokenizer vocabulary; passed to sentence_to_tfidf in the next cell.\n",
    "IDF = cal_idf(data, vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array([[0.        , 0.        , 0.16735766, 0.91629073, 0.68721805,\n",
      "        0.68721805, 0.68721805, 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        ],\n",
      "       [0.        , 0.        , 0.22314355, 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.91629073, 0.91629073, 0.91629073,\n",
      "        0.91629073, 0.91629073, 0.91629073, 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.91629073, 0.91629073,\n",
      "        0.91629073, 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        ],\n",
      "       [0.        , 0.        , 0.22314355, 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.91629073, 0.91629073, 0.91629073, 0.91629073,\n",
      "        0.91629073, 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.91629073, 0.91629073, 0.91629073, 0.91629073,\n",
      "        0.91629073]])\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_tfidf(data, vocab, IDF))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[3, 4, 5, 3, 6, 2, 0],\n",
      " [7, 8, 9, 10, 11, 12, 2],\n",
      " [13, 14, 15, 0, 0, 0, 0],\n",
      " [16, 17, 18, 19, 20, 2, 0],\n",
      " [21, 22, 23, 24, 25, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_index(data, vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[3, 4, 5, 3, 6, 2, 0, 0, 0, 0],\n",
      " [7, 8, 9, 10, 11, 12, 2, 0, 0, 0],\n",
      " [13, 14, 15, 0, 0, 0, 0, 0, 0, 0],\n",
      " [16, 17, 18, 19, 20, 2, 0, 0, 0, 0],\n",
      " [21, 22, 23, 24, 25, 0, 0, 0, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_index(data,vocab, 10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data processing with the Twitter POS (part-of-speech) extractor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['나', '생각', '하다', '고로', '나', '존재', '하다'],\n",
      " ['모든', '국가', '그', '걸', '맞다', '국가', '가지다'],\n",
      " ['것', '또한', '지나가다'],\n",
      " ['죄', '미워하다', '사람', '미워하다', '마르다'],\n",
      " ['일찍', '일어나다', '새', '벌레', '잡다']]\n"
     ]
    }
   ],
   "source": [
    "pprint(pos_process(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the vocabulary from the POS tokens. This rebinds vocab and vocab_size,\n",
    "# so cells from the previous section must not be re-run against this vocabulary.\n",
    "vocab, _, vocab_size = build_vocab_pos(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['<PAD>', '<UNK>', '나', '하다', '국가', '미워하다', '생각', '고로', '존재', '모든', '그', '걸', '맞다', '가지다', '것', '또한', '지나가다', '죄', '사람', '마르다', '일찍', '일어나다', '새', '벌레', '잡다'])\n"
     ]
    }
   ],
   "source": [
    "pprint(vocab.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array([[0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0],\n",
      "       [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0],\n",
      "       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0],\n",
      "       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,\n",
      "        0, 0, 0],\n",
      "       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,\n",
      "        1, 1, 1]])\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_onehot_pos(data, vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Result of cal_idf_pos for the POS vocabulary; rebinds IDF from the previous section\n",
    "# and is passed to sentence_to_tfidf_pos in the next cell.\n",
    "IDF = cal_idf_pos(data, vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array([[0.        , 0.        , 0.91629073, 0.91629073, 0.        ,\n",
      "        0.        , 0.68721805, 0.68721805, 0.68721805, 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.91629073,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.68721805,\n",
      "        0.68721805, 0.68721805, 0.68721805, 0.68721805, 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.91629073,\n",
      "        0.91629073, 0.91629073, 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.91629073, 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.68721805, 0.68721805, 0.68721805,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073]])\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_tfidf_pos(data, vocab, IDF))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[2, 6, 3, 7, 2, 8, 3],\n",
      " [9, 4, 10, 11, 12, 4, 13],\n",
      " [14, 15, 16, 0, 0, 0, 0],\n",
      " [17, 5, 18, 5, 19, 0, 0],\n",
      " [20, 21, 22, 23, 24, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_index_pos(data, vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[2, 6, 3, 7, 2, 8, 3, 0, 0, 0],\n",
      " [9, 4, 10, 11, 12, 4, 13, 0, 0, 0],\n",
      " [14, 15, 16, 0, 0, 0, 0, 0, 0, 0],\n",
      " [17, 5, 18, 5, 19, 0, 0, 0, 0, 0],\n",
      " [20, 21, 22, 23, 24, 0, 0, 0, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_index_pos(data,vocab, 10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data processing with the Twitter morphs (morpheme) extractor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['나', '는', '생각한', '다', '고로', '나', '는', '존재한', '다', '.'],\n",
      " ['모든', '국가', '는', '그', '에', '걸', '맞는', '국가', '를', '가진', '다', '.'],\n",
      " ['이', '것', '또한', '지나가', '리라'],\n",
      " ['죄', '는', '미워하', '되', '사람', '은', '미워하지', '마라', '.'],\n",
      " ['일찍', '일어나는', '새', '가', '벌레', '를', '잡는', '다']]\n"
     ]
    }
   ],
   "source": [
    "pprint(morphs_process(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the vocabulary from the morpheme tokens. This rebinds vocab and vocab_size,\n",
    "# so cells from the earlier sections must not be re-run against this vocabulary.\n",
    "vocab, _, vocab_size = build_vocab_morphs(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['<PAD>', '<UNK>', '는', '다', '.', '나', '국가', '를', '생각한', '고로', '존재한', '모든', '그', '에', '걸', '맞는', '가진', '이', '것', '또한', '지나가', '리라', '죄', '미워하', '되', '사람', '은', '미워하지', '마라', '일찍', '일어나는', '새', '가', '벌레', '잡는'])\n"
     ]
    }
   ],
   "source": [
    "pprint(vocab.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array([[0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
      "       [0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
      "       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
      "       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],\n",
      "       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_onehot_morphs(data, vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Result of cal_idf_morphs for the morpheme vocabulary; rebinds IDF from the previous section\n",
    "# and is passed to sentence_to_tfidf_morphs in the next cell.\n",
    "IDF = cal_idf_morphs(data, vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "array([[0.        , 0.        , 0.22314355, 0.22314355, 0.16735766,\n",
      "        0.91629073, 0.        , 0.        , 0.68721805, 0.68721805,\n",
      "        0.68721805, 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.16735766, 0.16735766, 0.16735766,\n",
      "        0.        , 0.91629073, 0.38311922, 0.        , 0.        ,\n",
      "        0.        , 0.68721805, 0.68721805, 0.68721805, 0.68721805,\n",
      "        0.68721805, 0.68721805, 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.91629073, 0.91629073, 0.91629073,\n",
      "        0.91629073, 0.91629073, 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.22314355, 0.        , 0.22314355,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.91629073, 0.91629073, 0.91629073,\n",
      "        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ],\n",
      "       [0.        , 0.        , 0.        , 0.22314355, 0.        ,\n",
      "        0.        , 0.        , 0.51082562, 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.        ,\n",
      "        0.        , 0.        , 0.        , 0.        , 0.91629073,\n",
      "        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073]])\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_tfidf_morphs(data, vocab, IDF))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[5, 2, 8, 3, 9, 5, 2, 10, 3, 4, 0, 0],\n",
      " [11, 6, 2, 12, 13, 14, 15, 6, 7, 16, 3, 4],\n",
      " [17, 18, 19, 20, 21, 0, 0, 0, 0, 0, 0, 0],\n",
      " [22, 2, 23, 24, 25, 26, 27, 28, 4, 0, 0, 0],\n",
      " [29, 30, 31, 32, 33, 7, 34, 3, 0, 0, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_index_morphs(data, vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[5, 2, 8, 3, 9, 5, 2, 10, 3, 4],\n",
      " [11, 6, 2, 12, 13, 14, 15, 6, 7, 16],\n",
      " [17, 18, 19, 20, 21, 0, 0, 0, 0, 0],\n",
      " [22, 2, 23, 24, 25, 26, 27, 28, 4, 0],\n",
      " [29, 30, 31, 32, 33, 7, 34, 3, 0, 0]]\n"
     ]
    }
   ],
   "source": [
    "pprint(sentence_to_index_morphs(data,vocab, 10))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python tensor",
   "language": "python",
   "name": "myenv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
